load('./dm.Rdata')
df_raw = dm_base %>%
rename(subject_id = JiBenCID,
weight = tizhong,
height = ShenGao,
exercise_time = xiuxiansj) %>%
mutate(gender = ifelse(xingbie == 1, "Male", "Female"),
tb = ifelse(censer == 1, "No", "Yes"),
exercise = as.factor(xiuxiantl)) %>%
select(-xingbie, -censer, -xiuxiantl) %>%
janitor::clean_names()
levels(df_raw$exercise) <- list("Mild" = 1, "Medium" = 2,'heavy'=c(3,4))
df_combine = dm_base %>%
rename(
subject_id = JiBenCID,
glu_average = fastglu,
weight_initial = tizhong_1st,
weight_average = tizhong,
height = ShenGao,
glu_initial = kfxt_1st,
gender = xingbie,
district = GuanLiQX,
sys_pressure = Sbp,
dia_pressure = Dbp,
exercise_time = xiuxiansj,
exercise = xiuxiantl,
drug_insulin = insulin,
drug_oral_sulfo = sulfonylurea,
drug_oral_biguanide = biguanide,
drug_oral_glu = glu_inhib,
retina = reti,
skin = derm,
vessel = vesl,
nerve = neur,
kidney = neph,
depression = depress,
dmtime = quezhensj,
birthyear = birth_year,
birthmon = birth_mon,
dmdatayear = rucu_year,
dmdatamon = rucu_mon,
dmdataage = rucuage) %>%
mutate(
bmi_initial = weight_initial/(height/100)^2,
bmi_average = weight_average/(height/100)^2,
bmi_change = bmi_average - bmi_initial,
glu_change = glu_average - glu_initial,
tb = ifelse(censer == 1, "No", "Yes"),
exercise = as.factor(exercise),
drug_oral = case_when(drug_oral_biguanide == 0 & drug_oral_biguanide == 0 & drug_oral_glu == 0 ~0, TRUE ~ 1),
drug = case_when(drug_oral == 0 & drug_insulin ==0 ~ 0,TRUE ~ 1),
retina = as.numeric(retina),
skin = as.numeric(skin),
vessel = as.numeric(vessel),
nerve = as.numeric(nerve),
kidney = as.numeric(kidney),
complications = retina + skin + vessel + nerve + kidney + depression,
complications = as.factor(complications)
)
levels(df_combine$complications) <- list(none=0,one=1,more_than_two=c(2,6))
levels(df_combine$exercise) <- list('1' = 1, '2' = 2, '3' = c(3,4))
df_exercise = df_combine %>%
mutate(exercise = as.numeric(exercise),
total_exercise = exercise * exercise_time,
gender = as.factor(gender))
df_incidence <- df_exercise %>%
mutate(tb = fct_recode(tb, '1'= 'Yes', '0'='No')) %>%
mutate(tb=as.character(tb),
tb=as.numeric(tb)) %>%
group_by(total_exercise) %>%
summarise(tb_sum = sum(tb),
incidence = tb_sum/n()) %>%
ungroup()
df_exercise = df_exercise %>%
inner_join(df_incidence, by = "total_exercise")
df_exercise$survival = with(df_exercise, Surv(days, tb == "Yes"))
km <- survfit(survival ~ 1, data = df_exercise, conf.type = "log-log")
km_by_exercise <- survfit(survival ~ exercise, data = df_exercise, conf.type = "log-log")
km_by_exercise_time <- survfit(survival ~ exercise_time, data = df_exercise, conf.type = "log-log")
plot_exercise_level <- GGally::ggsurv(km_by_exercise, main = "Kaplan-Meier Curve for getting TB of different exercise level")
plotly::ggplotly(plot_exercise_level)
plot_exercise_time <- GGally::ggsurv(km_by_exercise_time, main = "Kaplan-Meier Curve for getting TB of different exercise time")
plotly::ggplotly(plot_exercise_time)
The plot shows that people with lower exercise level have higher probability of getting TB, which show that exercise is important to preventing TB. However, people with over 2 hours exercise time have higher probability of getting TB. That may be because that people spending more time on exercise have more probability touching other people and getting infection.
plot_exer<-ggplotly(ggplot(df_exercise, aes(x = dmage, y =total_exercise, colour=dmage)) +
geom_histogram(stat = "identity", width = .6) +
labs(title="The average exercise vs age",
x = "age") +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(face="bold", size=12),
axis.text.y = element_text(angle=0, vjust=0.5, size=10),
legend.title = element_text(size=12, face="bold"),
legend.text = element_text(size = 12, face = "bold"))+
facet_wrap(~gender))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(plot_exer)